Temporal overview

# Temporal overview: per collection, the yearly number of poems (points) and a
# centred rolling mean of that number (line), one facet per collection.
#
# The rolling window is year-5 .. year+5, i.e. 11 calendar years, so the
# windowed total is divided by 11 and labelled as an 11 year mean. (Dividing
# the 11-year total by 10 overstated the mean by 10%.)
p_year %>%
  inner_join(poems, by = c("p_id")) %>%
  count(collection, year) %>%
  mutate(measure = "yearly count") %>%
  union_all(
    p_year %>% # 11 year centred rolling mean
      distinct(year) %>%
      # Non-equi self-join: pair each year (LHS) with every year in its window
      # (RHS); the joined columns come out as year.x / year.y.
      left_join(
        p_year %>% distinct(year),
        sql_on = "RHS.year BETWEEN LHS.year-5 AND LHS.year+5"
      ) %>%
      inner_join(p_year, by = c("year.y" = "year")) %>%
      inner_join(poems, by = c("p_id")) %>%
      group_by(collection = collection, year = year.x) %>%
      summarize(n = n() / 11, .groups = "drop") %>%
      mutate(measure = "11 year rolling mean")
  ) %>%
  # Years 0 and 9999 are excluded here; the "literary" collection is not
  # plotted.
  filter(year > 0, year < 9999, collection != "literary") %>%
  ggplot(aes(x = year, y = n, color = measure)) +
  geom_point(data = ~ .x %>% filter(measure == "yearly count")) +
  geom_line(data = ~ .x %>% filter(measure == "11 year rolling mean")) +
  theme_hsci_discrete(base_family = "Arial") +
  # Legend inside the panel, top-left corner, without background boxes.
  theme(
    legend.justification = c(0, 1),
    legend.position = c(0.02, 0.98),
    legend.background = element_blank(),
    legend.key = element_blank()
  ) +
  labs(color = NULL) +
  scale_y_continuous(
    breaks = seq(0, 20000, by = 2000),
    labels = scales::comma_format()
  ) +
  ylab("Poems") +
  scale_x_continuous(breaks = seq(1000, 2000, by = 50)) +
  xlab("Year") +
  facet_wrap(~collection, ncol = 1) +
  ggtitle("Number of poems by year and collection")

# Table of the poems whose year is one of the two special values 0 or 9999,
# counted per collection and year.
p_year %>%
  filter(year %in% c(0, 9999)) %>%
  left_join(poems) %>%
  count(collection, year) %>%
  ungroup() %>%
  gt() %>%
  tab_header(title = "Abnormal years") %>%
  fmt_integer(columns = n)
Abnormal years
collection year n
skvr 9999 469
erab 0 5,443

Overview of collectors

# One horizontal bar chart per collection with the number of poems per
# collector. The 100 collectors with the most poems are shown individually;
# the remainder is lumped into "Other", which is moved to the first factor
# level. The result is a list with one ggplot object per collection.
poems %>%
  distinct(collection) %>%
  pull() %>%
  map(function(coll_name) {
    collector_counts <- p_col %>%
      # `!!` forces the local value into the database query.
      inner_join(poems %>% filter(collection == !!coll_name), by = c("p_id")) %>%
      count(col_id) %>%
      left_join(collectors, by = c("col_id")) %>%
      select(col_id, name, n) %>%
      collect() %>%
      # str_c() returns NA when any piece is NA, so a collector without a name
      # would lose its col_id in the label; substitute a placeholder instead.
      mutate(
        col_id = fct_reorder(
          str_c(col_id, "|", coalesce(name, "(unnamed)")),
          n
        )
      ) %>%
      mutate(col_id = fct_lump_n(col_id, n = 100, w = n)) %>%
      # "Other" only exists when something was actually lumped; relevelling
      # unconditionally warns about an unknown level in small collections.
      mutate(
        col_id = if ("Other" %in% levels(col_id)) {
          fct_relevel(col_id, "Other")
        } else {
          col_id
        }
      ) %>%
      group_by(col_id) %>%
      tally(wt = n)

    ggplot(collector_counts, aes(x = col_id, y = n)) +
      geom_col() +
      geom_text(aes(label = p(n)), hjust = "left", nudge_y = 100) +
      theme_hsci_discrete(base_family = "Arial") +
      coord_flip() +
      labs(title = str_c("Collectors in ", coll_name))
  })
## Warning: 1 unknown level in `f`: Other
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

# Poem counts for collector ids that appear in p_col but have no matching row
# in the collectors table.
p_col %>%
  anti_join(collectors) %>%
  count(col_id) %>%
  gt() %>%
  tab_header(title = "Collectors without a name") %>%
  fmt_integer(columns = n)
Collectors without a name
col_id n
6873 106

Geographical overview

# Number of poem-location links per location name, fetched locally.
d <- p_loc %>%
  count(loc_id) %>%
  inner_join(locations) %>%
  select(name, n) %>%
  collect()

# Number of poems that have no row at all in p_loc.
poems_without_location <- poems %>%
  anti_join(p_loc) %>%
  count() %>%
  pull()

# Locations that cannot be drawn (no matching polygon), plus one NA-named row
# carrying the poems without any location.
unprojected_locations <- d %>%
  anti_join(polygons) %>%
  add_row(name = NA, n = poems_without_location)

# Choropleth of poem counts per polygon; the title reports how many poems are
# not represented on the map.
polygons %>%
  left_join(d) %>%
  tm_shape() +
  tm_polygons(col = "n", id = "name", style = "fisher", palette = "plasma") +
  tm_layout(
    title = str_c(
      "Geographical overview. Missing ",
      p(pull(tally(unprojected_locations, wt = n))),
      " poems."
    )
  )

Poem locations not mapped

# Table of the locations without a polygon, largest poem count first.
unprojected_locations %>%
  arrange(desc(n)) %>%
  gt() %>%
  tab_header(title = "Poem locations not mapped") %>%
  fmt_integer(columns = n)
Poem locations not mapped
name n
Narvusi 4,982
NA 4,326
Uusikirkko Vpl 1,934
välismaa 1,822
Vuole 1,679
Pyhäjärvi Vpl 1,000
Tartu 973
Viena 871
Peräpohjola 842
Pohjois-Pohjanmaa 809
Itä- ja Pohjois-Inkeri 799
Etelä-Karjala 770
Tallinn 685
Viljandi l. 602
Pärnu l. 598
Hämeenlinna 565
Keski-Inkeri 532
Viljandimaa 480
Tulomajärvi 468
Uusikaupunki 457
Võrumaa 452
Kiimaisjärvi 448
Viron Inkeri 433
Länsi-Inkeri 426
Narva l. 422
Sortavala mlk 419
Etelä-Savo 412
Tveri 404
Länsipohja 402
Viipuri mlk 400
Keminmaa 375
Pieksämäki 346
Läänemaa 329
Pyhäjärvi Ol 324
Rakvere l. 304
Savonlinna 254
Pyhäjärvi Ul 245
Pohjois-Karjala 240
Säräisniemi 233
Salo 209
Pohjois-Savo 208
Saaremaa 205
Lahti 189
Etelä-Pohjanmaa 181
Uusikirkko Tl 177
Tornio 172
Mikkeli mlk 161
Valga 150
Koski Hl 142
Porvoo mlk 134
Kainuu 119
Heinola mlk 117
Kuusankoski 115
Mänttä 113
Laatokan Karjala (Raja-Karjala) 107
Salo 107
Riihimäki 104
Helsingin pit 103
Pärnumaa 102
Tuutari=Tuuteri 100
Kotka 96
Satakunta 95
Alajärvi 91
Ruija 91
Paide l. 88
Lapväärtti-Lappfjärd 87
Haapsalu 86
Kuopio mlk 77
Toijala 70
Jyväskylä mlk 69
Häme 67
Lappeenranta 63
Varkaus 62
Kajaani mlk 60
Võru l. 60
Valgamaa 58
Karjala Tl 51
Pohjois-Pirkkala 47
Kovero 47
Uusimaa 46
Sõrve 40
Järvamaa 39
Virumaa 36
Novgorodin alue 34
Tartumaa 29
Varsinais-Suomi 26
Hiiumaa 23
Vaasa 22
Revonlahti-Revolax 22
Storfjord 20
Parainen 18
Lappi Tl 17
Vaala 17
Harjumaa 17
Karkkila 16
Kouvola 16
Lieksa 16
Rovaniemi mlk 16
Prunkkala 15
Kimito 15
Kerava 15
Valkeakoski 15
Kuusisto 13
Uusikaupunki mlk 13
Iisalmi mlk 13
Kokkola-Gamlakarleby 13
Muoslompolo 13
Hongonjoki 12
Mustasaari-Korsholm 12
Kemiö 11
Pieksämäki mlk 11
Pielisensuu 11
Alakiiminki 11
Särkisalo 10
Hamina 10
Kristiinankaupunki-Kristinestad 10
Siipyy-Sideby 10
Nauvo 9
Taipale (Enontaipale) 9
Junosuando 9
Karesuando 9
Kenjärvi 9
Jämsänkoski 8
Kuhmoniemi 8
Kuolajärvi 8
Äänislinna 8
Sulva-Solf-Solv 7
Muurmanni 7
Siuntio 6
Suolahti 6
Taipale 6
Uzmana 6
Suma 6
Vammala 5
Rauma mlk 5
Kistrand 5
Moseija 5
Kirkkonummi 4
Imatra 4
Uusikaarlepyy-Nykarleby 4
Yliveteli 4
Lauritsala 3
Ruukki 3
Koutokeino 3
Riipuskala 3
Kuressaare l. 3
Järvenpää 2
Pietarsaari-Jakobstad 2
Jepua-Jeppo 2
Kouta 2
Tiudia 2
Jaama 2
Ikaalinen mlk 1
Anjalankoski 1
Myllykoski 1
Loviisa 1
Sipoo 1
Keski-Suomi 1
Nurmes mlk 1
Petolahti-Petalax 1
Raippaluoto-Replot 1
Oulu mlk 1
Nordkapp 1
Maasöy 1
Muodoslompolo 1
Pietari=Leningrad 1
Siestarjoki 1
Ahvenanmaa 1

Geographical overview by collection

# Number of poem-location links per collection and location name, fetched
# locally.
d <- p_loc %>%
  left_join(poems) %>%
  count(collection, loc_id) %>%
  ungroup() %>%
  inner_join(locations) %>%
  select(collection, name, n) %>%
  collect()

# Per collection, the number of poems with no row in p_loc; the NA name lets
# these rows be stacked under the per-location counts.
poems_without_location <- poems %>%
  anti_join(p_loc) %>%
  count(collection) %>%
  collect() %>%
  mutate(name = NA_character_)

# Locations without a matching polygon, plus the poems without any location.
unprojected_locations <- d %>%
  anti_join(polygons) %>%
  union_all(poems_without_location)

# One choropleth per collection; each title reports how many poems of that
# collection are not represented on the map. Returns a list of tmap objects.
poems %>%
  distinct(collection) %>%
  pull() %>%
  map(function(coll_name) {
    # `!!` forces the local value into the database query.
    counts_by_name <- p_loc %>%
      inner_join(poems %>% filter(collection == !!coll_name), by = c("p_id")) %>%
      count(loc_id) %>%
      inner_join(locations) %>%
      select(name, n) %>%
      collect()

    tm_shape(left_join(polygons, counts_by_name)) +
      tm_polygons(col = "n", id = "name", style = "fisher", palette = "plasma") +
      tm_layout(
        title = str_c(
          "Geography of ", coll_name, ". Missing ",
          unprojected_locations %>%
            filter(collection == !!coll_name) %>%
            tally(wt = n) %>%
            pull() %>%
            p(),
          " poems."
        )
      )
  })
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

Poem locations not mapped by collection

# One table per collection listing its unmapped locations, largest poem count
# first. Returns a list of gt tables.
poems %>%
  distinct(collection) %>%
  pull() %>%
  map(function(coll_name) {
    unprojected_locations %>%
      filter(collection == !!coll_name) %>%
      arrange(desc(n)) %>%
      select(-collection) %>%
      gt() %>%
      tab_header(title = str_c("Poem locations not mapped in ", coll_name)) %>%
      fmt_integer(columns = n)
  })
[[1]]
Poem locations not mapped in skvr
name n
Narvusi 3,202
Vuole 1,524
Uusikirkko Vpl 1,131
Pyhäjärvi Vpl 785
Etelä-Karjala 658
Pohjois-Pohjanmaa 557
Länsipohja 396
Kiimaisjärvi 363
Etelä-Savo 350
Tveri 294
Viipuri mlk 277
Salo 198
Pieksämäki 183
Pyhäjärvi Ol 160
Koski Hl 134
Pohjois-Karjala 129
Uusikirkko Tl 116
Tulomajärvi 115
Tornio 113
Porvoo mlk 112
Pyhäjärvi Ul 102
Pohjois-Savo 98
Viena 98
Satakunta 83
Säräisniemi 74
Uusikaupunki 71
Heinola mlk 62
Häme 61
Ruija 60
Jyväskylä mlk 52
Keski-Inkeri 49
Hämeenlinna 48
Kovero 47
Tuutari=Tuuteri 45
Laatokan Karjala (Raja-Karjala) 44
Itä- ja Pohjois-Inkeri 42
Kainuu 38
Kuopio mlk 35
Peräpohjola 33
Varkaus 30
Kajaani mlk 30
Etelä-Pohjanmaa 27
Keminmaa 26
Varsinais-Suomi 24
Mänttä 22
Novgorodin alue 22
Kotka 20
Mikkeli mlk 17
Karjala Tl 16
Alajärvi 16
Kimito 15
Prunkkala 14
Savonlinna 14
Sortavala mlk 13
Revonlahti-Revolax 12
Alakiiminki 11
Taipale (Enontaipale) 9
Helsingin pit 7
Sulva-Solf-Solv 7
Siuntio 6
Mustasaari-Korsholm 6
Siipyy-Sideby 6
Uzmana 6
Suma 6
Lapväärtti-Lappfjärd 5
Vaasa 5
Lappeenranta 4
Kenjärvi 4
Vammala 3
Kirkkonummi 3
Kokkola-Gamlakarleby 3
Kuolajärvi 3
Uusimaa 2
Iisalmi mlk 2
Pietarsaari-Jakobstad 2
Jepua-Jeppo 2
Kouta 2
Länsi-Inkeri 2
Uusikaupunki mlk 1
Anjalankoski 1
Keski-Suomi 1
Lieksa 1
Raippaluoto-Replot 1
Kuhmoniemi 1
[[2]]
Poem locations not mapped in erab
name n
NA 2,008
välismaa 1,822
Tartu 973
Tallinn 685
Viljandi l. 602
Pärnu l. 598
Viljandimaa 480
Võrumaa 452
Narva l. 422
Läänemaa 329
Rakvere l. 304
Saaremaa 205
Valga 150
Pärnumaa 102
Paide l. 88
Haapsalu 86
Võru l. 60
Valgamaa 58
Sõrve 40
Järvamaa 39
Virumaa 36
Tartumaa 29
Hiiumaa 23
Harjumaa 17
Kuressaare l. 3
[[3]]
Poem locations not mapped in jr
name n
Narvusi 1,780
NA 1,591
Peräpohjola 809
Uusikirkko Vpl 803
Viena 773
Itä- ja Pohjois-Inkeri 757
Hämeenlinna 517
Keski-Inkeri 483
Viron Inkeri 433
Länsi-Inkeri 424
Sortavala mlk 406
Uusikaupunki 386
Tulomajärvi 353
Keminmaa 349
Pohjois-Pohjanmaa 252
Savonlinna 240
Pyhäjärvi Vpl 215
Lahti 189
Pyhäjärvi Ol 164
Pieksämäki 163
Säräisniemi 159
Vuole 155
Etelä-Pohjanmaa 154
Mikkeli mlk 144
Pyhäjärvi Ul 143
Viipuri mlk 123
Kuusankoski 115
Etelä-Karjala 112
Pohjois-Karjala 111
Pohjois-Savo 110
Tveri 110
Salo 107
Riihimäki 104
Helsingin pit 96
Mänttä 91
Kiimaisjärvi 85
Lapväärtti-Lappfjärd 82
Kainuu 81
Kotka 76
Alajärvi 75
Toijala 70
Laatokan Karjala (Raja-Karjala) 63
Etelä-Savo 62
Uusikirkko Tl 61
Lappeenranta 59
Tornio 59
Heinola mlk 55
Tuutari=Tuuteri 55
Pohjois-Pirkkala 47
Uusimaa 44
Kuopio mlk 42
Karjala Tl 35
Varkaus 32
Ruija 31
Kajaani mlk 30
Porvoo mlk 22
Storfjord 20
Parainen 18
Lappi Tl 17
Jyväskylä mlk 17
Vaasa 17
Vaala 17
Karkkila 16
Kouvola 16
Rovaniemi mlk 16
Kerava 15
Valkeakoski 15
Lieksa 15
Kuusisto 13
Muoslompolo 13
Uusikaupunki mlk 12
Hongonjoki 12
Satakunta 12
Novgorodin alue 12
Kemiö 11
Salo 11
Pieksämäki mlk 11
Iisalmi mlk 11
Pielisensuu 11
Särkisalo 10
Hamina 10
Kokkola-Gamlakarleby 10
Kristiinankaupunki-Kristinestad 10
Revonlahti-Revolax 10
Nauvo 9
Junosuando 9
Karesuando 9
Jämsänkoski 8
Koski Hl 8
Äänislinna 8
Kuhmoniemi 7
Muurmanni 7
Häme 6
Suolahti 6
Taipale 6
Mustasaari-Korsholm 6
Länsipohja 6
Rauma mlk 5
Kuolajärvi 5
Kistrand 5
Moseija 5
Kenjärvi 5
Imatra 4
Uusikaarlepyy-Nykarleby 4
Siipyy-Sideby 4
Yliveteli 4
Lauritsala 3
Ruukki 3
Koutokeino 3
Riipuskala 3
Varsinais-Suomi 2
Vammala 2
Järvenpää 2
Tiudia 2
Jaama 2
Prunkkala 1
Ikaalinen mlk 1
Myllykoski 1
Kirkkonummi 1
Loviisa 1
Sipoo 1
Nurmes mlk 1
Petolahti-Petalax 1
Oulu mlk 1
Nordkapp 1
Maasöy 1
Muodoslompolo 1
Pietari=Leningrad 1
Siestarjoki 1
Ahvenanmaa 1
[[4]]
Poem locations not mapped in literary
name n
NA 727

Spatiotemporal overview

# Poem counts per location and time period. Years 0 and 9999 are turned into
# NA, the remaining years are split into 11 equally sized bins (ntile), and
# each bin is labelled with its "min-max" year range.
d <- poems %>%
  left_join(
    p_year %>%
      mutate(year = if_else(year %in% c(0L, 9999L), NA, year))
  ) %>%
  collect() %>%
  mutate(year_ntile = ntile(year, 11)) %>%
  group_by(year_ntile) %>%
  mutate(years = str_c(min(year), "-", max(year))) %>%
  ungroup() %>%
  left_join(collect(p_loc)) %>%
  count(years, loc_id) %>%
  ungroup() %>%
  left_join(collect(select(locations, loc_id, name)))

# Small-multiple choropleths, one facet per period. complete() adds the
# missing name/period combinations (with NA counts) before the join.
polygons %>%
  left_join(complete(d, name, years)) %>%
  tm_shape() +
  tm_polygons(col = "n", id = "name", style = "fisher", palette = "plasma") +
  tm_layout(
    main.title = "Geographical overviews by time",
    legend.outside.size = 0.1
  ) +
  tm_facets(by = "years", ncol = 4)

Poem verse statistics

Line types

# Number of verse lines per collection and line type, most frequent type
# first within each collection.
d <- verses %>%
  left_join(verse_poem) %>%
  left_join(poems) %>%
  count(collection, type) %>%
  ungroup() %>%
  arrange(collection, desc(n)) %>%
  collect()

# The data stays grouped by collection when handed to gt(), so the proportion
# is each type's share within its own collection.
d %>%
  group_by(collection) %>%
  mutate(proportion = n / sum(n)) %>%
  gt() %>%
  fmt_integer(columns = n) %>%
  fmt_percent(columns = proportion)
type n proportion
skvr
V 1,340,987 94.63%
L 44,303 3.13%
CPT 27,869 1.97%
K 3,931 0.28%
erab
V 1,861,583 93.39%
PAG 53,040 2.66%
CPT 19,844 1.00%
L 18,465 0.93%
TYH 18,357 0.92%
REF 17,869 0.90%
LRY 3,868 0.19%
RRE 307 0.02%
MRK 52 0.00%
U 38 0.00%
LLI 2 0.00%
TYP 1 0.00%
jr
V 812,343 90.94%
L 49,411 5.53%
CPT 28,030 3.14%
K 3,502 0.39%
literary
V 82,460 97.54%
L 1,220 1.44%
CPT 777 0.92%
K 87 0.10%

Verse line lengths

# Frequency table: per collection, how many verse lines have a given number of
# characters (string length of verses_cl$text).
d_nr_characters <- verses_cl %>%
  mutate(nr_characters = str_length(text)) %>%
  left_join(verse_poem) %>%
  left_join(poems) %>%
  count(collection, nr_characters) %>%
  ungroup() %>%
  arrange(collection, desc(n)) %>%
  collect()

# Frequency table: per collection, how many verse lines have a given number of
# words. The word count of a verse is taken as the highest word position
# recorded for it in word_occ.
d_nr_words <- word_occ %>%
  group_by(v_id) %>%
  # SQL aggregates always drop NULLs; na.rm = TRUE states that explicitly and
  # avoids the dbplyr "Missing values are always removed" warning.
  summarise(nr_words = max(pos, na.rm = TRUE), .groups = "drop") %>%
  left_join(verse_poem) %>%
  left_join(poems) %>%
  count(collection, nr_words) %>%
  ungroup() %>%
  arrange(collection, desc(n)) %>%
  collect()
## Warning: Missing values are always removed in SQL aggregation functions.
## Use `na.rm = TRUE` to silence this warning
## This warning is displayed once every 8 hours.

Verse line lengths in characters

# Bar chart of verse-line lengths in characters (lines of up to 60 characters
# only), one panel per collection with its own y scale.
d_nr_characters %>%
  filter(nr_characters <= 60) %>%
  ggplot(aes(x = nr_characters, y = n)) +
  geom_col(width = 1) +
  facet_wrap(~collection, scales = "free_y") +
  theme_hsci_discrete(base_family = "Arial") +
  scale_y_continuous(labels = scales::comma_format()) +
  labs(
    x = "Number of characters",
    y = "Verses",
    title = "Number of characters in verse lines"
  )

# Ridgeline plot of verse-line lengths in characters, one ridge per
# collection. The ridge height is each length's share of all lines in the
# collection; the share is computed before the <= 60 filter, so it is relative
# to every line, including the longer ones that are not drawn.
d_nr_characters %>%
  group_by(collection) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup() %>%
  filter(nr_characters <= 60) %>%
  ggplot(aes(x = nr_characters, y = collection, fill = collection, height = prop)) +
  geom_density_ridges(stat = "identity") +
  theme_hsci_discrete(base_family = "Arial") +
#  scale_y_continuous(labels=scales::percent_format()) +
  xlab("Number of characters") +
  # The y axis carries the collections (one ridge each), not a verse count.
  ylab("Collection") +
  labs(title = "Number of characters in verse lines")

Verse lines with more than 60 characters

# Per collection: number and share of verse lines longer than 60 characters,
# sorted by the number of such lines.
d_nr_characters %>%
  mutate(nl = if_else(nr_characters > 60, n, 0L)) %>%
  group_by(collection) %>%
  summarise(
    lines = sum(nl),
    proportion = sum(nl) / sum(n),
    .groups = "drop"
  ) %>%
  arrange(desc(lines)) %>%
  gt() %>%
  tab_header(title = "Verse lines with more than 60 characters") %>%
  fmt_integer(columns = lines) %>%
  fmt_percent(columns = proportion)
Verse lines with more than 60 characters
collection lines proportion
jr 1,911 0.24%
erab 291 0.02%
skvr 202 0.02%
literary 1 0.00%

Verse line lengths in words

# Bar chart of verse-line lengths in words (lines of up to 10 words only), one
# panel per collection with its own y scale.
d_nr_words %>%
  filter(nr_words <= 10) %>%
  ggplot(aes(x = nr_words, y = n)) +
  geom_col(width = 1) +
  facet_wrap(~collection, scales = "free_y") +
  scale_x_continuous(breaks = seq(0, 10, by = 2)) +
  scale_y_continuous(labels = scales::comma_format()) +
  theme_hsci_discrete(base_family = "Arial") +
  labs(
    x = "Number of words",
    y = "Verses",
    title = "Number of words in verse lines"
  )

# Stepped ridgeline (binned) plot of verse-line lengths in words, one ridge
# per collection. uncount() expands the frequency table back to one row per
# verse line so that stat_binline can bin the raw values.
d_nr_words %>%
  filter(nr_words <= 10) %>%
  uncount(n) %>%
  ggplot(aes(x = nr_words, y = collection, fill = collection)) +
  stat_binline(binwidth = 1) +
  theme_hsci_discrete(base_family = "Arial") +
  scale_x_continuous(breaks = seq(0, 10, by = 2)) +
  xlab("Number of words") +
  # The y axis carries the collections (one ridge each), not a verse count.
  ylab("Collection") +
#  scale_y_continuous(labels=scales::percent_format()) +
  labs(title = "Number of words in verse lines")

Verse lines with more than 10 words

# Per collection: number and share of verse lines with more than 10 words,
# sorted by the number of such lines.
d_nr_words %>%
  mutate(nl = if_else(nr_words > 10, n, 0L)) %>%
  group_by(collection) %>%
  summarise(
    lines = sum(nl),
    proportion = sum(nl) / sum(n),
    .groups = "drop"
  ) %>%
  arrange(desc(lines)) %>%
  gt() %>%
  tab_header(title = "Verse lines with more than 10 words") %>%
  fmt_integer(columns = lines) %>%
  fmt_percent(columns = proportion)
Verse lines with more than 10 words
collection lines proportion
jr 839 0.11%
erab 257 0.01%
skvr 38 0.00%
literary 9 0.01%
# Number of words per verse, taken as the highest word position recorded for
# the verse in word_occ; stored via the compute_a() helper with an index.
verse_nr_words <- word_occ %>%
  group_by(v_id) %>%
  # SQL aggregates always drop NULLs; na.rm = TRUE states that explicitly and
  # avoids the dbplyr "Missing values are always removed" warning.
  summarise(nr_words = max(pos, na.rm = TRUE)) %>%
  compute_a(unique_indexes = list(c("v_id", "nr_words")))

# Length in characters of every word; stored via the compute_a() helper with
# an index.
word_nr_characters <- words %>%
  mutate(nr_characters = str_length(text)) %>%
  select(w_id, nr_characters) %>%
  compute_a(unique_indexes = list(c("w_id", "nr_characters")))

# Frequency table of word occurrences by collection, verse length in words,
# word position, and word length in characters.
d <- word_occ %>%
  left_join(word_nr_characters) %>%
  left_join(verse_nr_words) %>%
  # verse_poem has its own `pos` column; drop it so that word_occ's `pos` is
  # the one that is counted below.
  left_join(verse_poem %>% select(-pos), by = c("v_id")) %>%
  left_join(poems) %>%
  count(collection, nr_words, pos, nr_characters) %>%
  collect()
# Distribution of word lengths (in characters) by word position, for verses of
# 2 to 5 words. Rows of the facet grid are collections, columns are word
# positions; within each panel there is one binned ridge per verse length.
#
# The former grouped `prop = n / sum(n)` step was removed: no layer of the
# plot used it, since stat_binline works on the rows produced by uncount().
d %>%
  filter(nr_words >= 2, nr_words < 6) %>%
  mutate(nr_words = as_factor(nr_words), pos = as_factor(pos)) %>%
  # Expand the frequency table to one row per word occurrence for binning.
  uncount(n) %>%
  ggplot(aes(x = nr_characters, y = nr_words, fill = nr_words)) +
  stat_binline(binwidth = 1) +
  facet_grid(collection ~ pos, labeller = labeller(pos = label_both)) +
  xlab("Number of characters in word") +
  ylab("Number of words in verse") +
  labs(
    title = "Number of characters in words by their position",
    subtitle = "According to length of verse and collection"
  ) +
  theme_hsci_discrete(base_family = "Arial")